# -*- coding: utf-8 -*-
# @Time: 2025/6/29 21:54
# @Author: wzd
# @Email: 2146333089@qq.com
# @File: saveFormat.py
import pandas as pd
import json
from pathlib import Path


def save_to_unified_format(sections, source_file, output_dir="./统一存储"):
    """Save structured sections to the unified storage format.

    Two files are written into ``output_dir``: ``<stem>.parquet`` with one
    row per section, and ``<stem>_meta.json`` with summary metadata, where
    ``<stem>`` is the source file name without its suffix.

    Args:
        sections: Iterable of mappings, each providing the keys ``"title"``,
            ``"level"`` and ``"content"``. ``"content"`` must be an iterable
            of strings; they are joined with newlines into a single string.
        source_file: Path of the document the sections came from, given as
            a ``pathlib.Path`` or a string. Only its name, stem and suffix
            are used; the file itself is not opened.
        output_dir: Directory receiving the output files. It is created,
            together with any missing parents, when it does not exist.

    Raises:
        KeyError: If a section lacks one of the required keys.
        ImportError: If pandas cannot find a Parquet engine.
    """
    # Accept plain strings as well as Path objects.
    source_file = Path(source_file)
    output_dir = Path(output_dir)

    # One record per section; ``section_order`` keeps the original ordering
    # recoverable after the rows are filtered or shuffled downstream.
    records = [
        {
            "section_id": f"{source_file.stem}_{idx}",
            "title": section["title"],
            "level": section["level"],
            "content": "\n".join(section["content"]),
            "source_file": source_file.name,
            "section_order": idx,
        }
        for idx, section in enumerate(sections)
    ]

    metadata = {
        "source": source_file.name,
        # Suffix without the leading dot, e.g. "md" for "doc.md".
        "format": source_file.suffix[1:],
        "total_sections": len(records),
        "processing_date": pd.Timestamp.now().isoformat(),
    }

    # Without this, the very first run fails because the directory is absent.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parquet is a columnar format, efficient for later analytical reads.
    pd.DataFrame(records).to_parquet(output_dir / f"{source_file.stem}.parquet")

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly rather than with the platform's locale codec.
    meta_path = output_dir / f"{source_file.stem}_meta.json"
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False)